{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Merge data for Science, Poetry, and Nonfiction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "# Directory whose sub-folders are scanned for .txt passages. The trailing '/' is kept so that\n",
    "# plain string concatenation onto this prefix yields valid paths.\n",
    "path = '/Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/data/hand-annotate/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(450, 3)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENRE</th>\n",
       "      <th>FILENAME</th>\n",
       "      <th>TEXT</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086462549-norm.txt</td>\n",
       "      <td>Provided always, That the contracts of such Co...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086596372-norm.txt</td>\n",
       "      <td>- 2 \\n. W \\n- . \"*\"' - 32. The coeﬃcients ſoun...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086756702-norm.txt</td>\n",
       "      <td>It is referred to in the \\n“Dinnseanchus, ” an...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086785002-norm.txt</td>\n",
       "      <td>It is said to be imbricated. Compare the peria...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086857689-norm.txt</td>\n",
       "      <td>- .~ \\nhi - \\nf‘. w \\nHUMMINGBIRDS. 215 \\n\\n\\n...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       GENRE                             FILENAME  \\\n",
       "0  19CNONFIC  19CNONFIC_5S_chi-086462549-norm.txt   \n",
       "1  19CNONFIC  19CNONFIC_5S_chi-086596372-norm.txt   \n",
       "2  19CNONFIC  19CNONFIC_5S_chi-086756702-norm.txt   \n",
       "3  19CNONFIC  19CNONFIC_5S_chi-086785002-norm.txt   \n",
       "4  19CNONFIC  19CNONFIC_5S_chi-086857689-norm.txt   \n",
       "\n",
       "                                                TEXT  \n",
       "0  Provided always, That the contracts of such Co...  \n",
       "1  - 2 \\n. W \\n- . \"*\"' - 32. The coeﬃcients ſoun...  \n",
       "2  It is referred to in the \\n“Dinnseanchus, ” an...  \n",
       "3  It is said to be imbricated. Compare the peria...  \n",
       "4  - .~ \\nhi - \\nf‘. w \\nHUMMINGBIRDS. 215 \\n\\n\\n...  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def read_txt(p):\n",
    "    \"\"\"Read the text file at path `p` and return its full contents as a str.\n",
    "\n",
    "    The file is decoded as UTF-8 explicitly instead of relying on the\n",
    "    platform default encoding, so passages containing non-ASCII characters\n",
    "    are read the same way on every machine.\n",
    "    \"\"\"\n",
    "    with open(p, 'r', encoding='utf-8') as F:\n",
    "        txt = F.read()\n",
    "    return txt\n",
    "\n",
    "def create_table():\n",
    "    \"\"\"Collect the .txt passages found under `path` into a single DataFrame.\n",
    "\n",
    "    Every immediate sub-directory of the module-level `path` is scanned,\n",
    "    except 'historical-corpora' and entries whose name starts with '.'.\n",
    "    The genre label is the part of the file name before the first underscore.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame with columns GENRE, FILENAME and TEXT, one row per file.\n",
    "    \"\"\"\n",
    "    rows = []\n",
    "    # os.listdir returns entries in arbitrary order; sort so the row order is reproducible.\n",
    "    for folder in sorted(os.listdir(path)):\n",
    "        # os.path.join works whether or not `path` ends with a separator.\n",
    "        folder_path = os.path.join(path, folder)\n",
    "        if folder == 'historical-corpora' or folder.startswith('.') or not os.path.isdir(folder_path):\n",
    "            continue\n",
    "\n",
    "        for fname in sorted(os.listdir(folder_path)):\n",
    "            if not fname.endswith('.txt'):\n",
    "                continue\n",
    "            genre = fname.split('_')[0]\n",
    "            txt = read_txt(os.path.join(folder_path, fname))\n",
    "            rows.append([genre, fname, txt])\n",
    "\n",
    "    return pd.DataFrame(rows, columns=['GENRE', 'FILENAME', 'TEXT'])\n",
    "\n",
    "\n",
    "# Build the merged table, print its (rows, columns) shape, and preview the first rows.\n",
    "df = create_table()\n",
    "print(df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "##df.to_csv('/Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/data/450passages_poetry_nonfic_science.tsv', sep='\\t', index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write table for Poetry (120 new passages; different authors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(120, 3)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENRE</th>\n",
       "      <th>FILENAME</th>\n",
       "      <th>TEXT</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>POETRY</td>\n",
       "      <td>POETRY_5S_19C_100517_ImageSelwyn_SCIENTIACRESC...</td>\n",
       "      <td>\\n\\t\\tYet with unnumbered miracles hast thou ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>POETRY</td>\n",
       "      <td>POETRY_5S_19C_102530_ChiversTHThomasHolley_TOT...</td>\n",
       "      <td>I die! but not for want of bliss---\\n \\n\\t\\tBu...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>POETRY</td>\n",
       "      <td>POETRY_5S_19C_102991_BrackenThomas_OLDBENDIGO_...</td>\n",
       "      <td>\\n\\t\\tWhere are they now? mate, they'll drive...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>POETRY</td>\n",
       "      <td>POETRY_5S_19C_103625_HardyThomas_AMILITARYAPPO...</td>\n",
       "      <td>\\n\\t\\t\\n\"\\nSo\\nback you have come from the to...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>POETRY</td>\n",
       "      <td>POETRY_5S_19C_104186_WadeThomas_XXXVIIALAMENTF...</td>\n",
       "      <td>\\n\\t\\tThe ecstasy of Life is gone!  \\n\\t\\tLik...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    GENRE                                           FILENAME  \\\n",
       "0  POETRY  POETRY_5S_19C_100517_ImageSelwyn_SCIENTIACRESC...   \n",
       "1  POETRY  POETRY_5S_19C_102530_ChiversTHThomasHolley_TOT...   \n",
       "2  POETRY  POETRY_5S_19C_102991_BrackenThomas_OLDBENDIGO_...   \n",
       "3  POETRY  POETRY_5S_19C_103625_HardyThomas_AMILITARYAPPO...   \n",
       "4  POETRY  POETRY_5S_19C_104186_WadeThomas_XXXVIIALAMENTF...   \n",
       "\n",
       "                                                TEXT  \n",
       "0   \\n\\t\\tYet with unnumbered miracles hast thou ...  \n",
       "1  I die! but not for want of bliss---\\n \\n\\t\\tBu...  \n",
       "2   \\n\\t\\tWhere are they now? mate, they'll drive...  \n",
       "3   \\n\\t\\t\\n\"\\nSo\\nback you have come from the to...  \n",
       "4   \\n\\t\\tThe ecstasy of Life is gone!  \\n\\t\\tLik...  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def read_txt(p):\n",
    "    \"\"\"Read the text file at path `p` and return its full contents as a str.\n",
    "\n",
    "    The file is decoded as UTF-8 explicitly instead of relying on the\n",
    "    platform default encoding, so passages containing non-ASCII characters\n",
    "    are read the same way on every machine.\n",
    "    \"\"\"\n",
    "    with open(p, 'r', encoding='utf-8') as F:\n",
    "        txt = F.read()\n",
    "    return txt\n",
    "\n",
    "def create_table():\n",
    "    \"\"\"Collect the .txt passages of the 'poetry' sub-directory of `path` into a DataFrame.\n",
    "\n",
    "    The genre label is the part of the file name before the first underscore.\n",
    "    If `path` has no entry named 'poetry', the returned table is empty.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame with columns GENRE, FILENAME and TEXT, one row per file.\n",
    "    \"\"\"\n",
    "    rows = []\n",
    "    # Look the folder up directly instead of looping over every entry of `path`.\n",
    "    if 'poetry' in os.listdir(path):\n",
    "        # os.path.join works whether or not `path` ends with a separator.\n",
    "        poetry_dir = os.path.join(path, 'poetry')\n",
    "        # os.listdir returns entries in arbitrary order; sort so the row order is reproducible.\n",
    "        for fname in sorted(os.listdir(poetry_dir)):\n",
    "            if not fname.endswith('.txt'):\n",
    "                continue\n",
    "            genre = fname.split('_')[0]\n",
    "            rows.append([genre, fname, read_txt(os.path.join(poetry_dir, fname))])\n",
    "\n",
    "    return pd.DataFrame(rows, columns=['GENRE', 'FILENAME', 'TEXT'])\n",
    "\n",
    "\n",
    "# Build the poetry-only table, print its (rows, columns) shape, and preview the first rows.\n",
    "df = create_table()\n",
    "print(df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export is disabled; uncomment both lines below to write the poetry table as a TSV file.\n",
    "#df.to_csv('/Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/data/poetry_120passages_diffauthors.tsv',\n",
    "#          sep='\\t', index=None)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
